In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline
# General libraries.
import re
import time as time
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import matplotlib.cm as cm
from matplotlib.ticker import FormatStrFormatter
from itertools import product
import pandas as pd
from IPython.display import display, HTML
# feature analysis and selection
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction import DictVectorizer
# Preprocessing
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder, Imputer
from sklearn.model_selection import train_test_split, cross_val_score
# Processing
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn import metrics
# SKLearn
from statsmodels.regression.linear_model import OLS
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
Data Fields
SOC, pH, Ca, P, and Sand are the five target variables for prediction. The data have been monotonically transformed from the original measurements and thus include negative values.
PIDN: unique soil sample identifier
SOC: Soil organic carbon
pH: pH values
Ca: Mehlich-3 extractable Calcium
P: Mehlich-3 extractable Phosphorus
Sand: Sand content
m7497.96 - m599.76: 3,578 mid-infrared absorbance measurements. For example, the "m7497.96" column is the absorbance at wavenumber 7497.96 cm-1. We suggest you remove the spectral CO2 bands, which are in the region m2379.76 to m2352.76, but you do not have to.
Depth: Depth of the soil sample (2 categories: "Topsoil", "Subsoil")
Some potential spatial predictors from remote sensing data sources are also included; additional descriptions can be found in the AfSIS data documentation. These data have been mean centered and scaled.
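The CO2-band suggestion above is easy to apply at load time. Below is a minimal sketch (not part of the original notebook cells) of dropping those columns with pandas, assuming the training.csv layout described above and spectral column names of the form m<wavenumber>:
import pandas as pd

df = pd.read_csv('training.csv')

def is_co2_band(col):
    # Wavenumber columns are named like "m2379.76"; treat anything inside the
    # suggested CO2 region as a band to drop.
    if not col.startswith('m'):
        return False
    try:
        wavenumber = float(col[1:])
    except ValueError:
        return False
    return 2352.76 <= wavenumber <= 2379.76

co2_cols = [c for c in df.columns if is_co2_band(c)]
df_no_co2 = df.drop(co2_cols, axis=1)
print('Dropped %d CO2 band columns' % len(co2_cols))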
In [2]:
# Load training data
X = np.genfromtxt('training.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=range(1, 3594)) # Load feature columns 1 to 3593 inclusive
n = np.genfromtxt('training.csv',
delimiter=',',
max_rows = 1,
names = True,
usecols=range(1, 3594)) # Load feature columns 1 to 3593 inclusive
feature_names = np.asarray(n.dtype.names)
Depth = np.genfromtxt('training.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=3594) # Load Depth values
PIDN = np.genfromtxt('training.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=0) # Load the PIDN for reference
Ca = np.genfromtxt('training.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=3595) # Load Mehlich-3 extractable Calcium data
P = np.genfromtxt('training.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=3596) # Load Mehlich-3 extractable Phosphorus data
pH = np.genfromtxt('training.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=3597) # Load pH data
SOC = np.genfromtxt('training.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=3598) # Load Soil Organic Carbon data
Sand = np.genfromtxt('training.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=3599) # Load Sand Content data
# Outcome (or response) variable list
y_var_labels = ['Ca', 'P', 'pH', 'SOC', 'Sand']
y_vars = [Ca, P, pH, SOC, Sand]
# Color map for outcome variables
colors = ['orange', 'yellowgreen', 'powderblue', 'sienna', 'tan']
In [3]:
# Load test data
test_x = np.genfromtxt('sorted_test.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=range(1, 3594)) # Load feature columns 1 to 3593 inclusive
test_depth = np.genfromtxt('sorted_test.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=3594) # Load Depth values
test_ids = np.genfromtxt('sorted_test.csv',
delimiter=',',
dtype=None,
skip_header = 1,
usecols=0) # Load the test sample IDs (PIDN) for reference
In [4]:
# Transform depth and concatenate to X and test_x for use
le = LabelEncoder()
depth_enc = le.fit_transform(Depth).astype(np.float64)
test_depth_enc = le.transform(test_depth).astype(np.float64)  # reuse the encoder fitted on the training labels
X_wDepth = np.concatenate((X, depth_enc.reshape(1,-1).T), axis=1)
test_x_wdepth = np.concatenate((test_x, test_depth_enc.reshape(1,-1).T), axis=1)
In [5]:
# Inspect the data shapes
print "Training data shape: ", X.shape
print "Feature name shape: ", feature_names.shape
print "PIDN data shape: ", PIDN.shape
print "Depth data shape: ", Depth.shape
print "Ca data shape: ", Ca.shape
print "P data shape: ", P.shape
print "pH data shape: ", pH.shape
print "SOC data shape: ", SOC.shape
print "Sand data shape: ", Sand.shape
print "Test data shape: ", test_x.shape
print "Test_ids shape: ", test_ids.shape
In [6]:
# Inspect the data in the five response variables
print "Ca: total = %d, max = %0.2f, mean = %0.2f, min = %0.2f" % (Ca.shape[0], np.max(Ca), np.mean(Ca), np.min(Ca))
print "P: total = %d, max = %0.2f, mean = %0.2f, min = %0.2f" % (P.shape[0], np.max(P), np.mean(P), np.min(P))
print "pH: total = %d, max = %0.2f, mean = %0.2f, min = %0.2f" % (pH.shape[0], np.max(pH), np.mean(pH), np.min(pH))
print "SOC: total = %d, max = %0.2f, mean = %0.2f, min = %0.2f" % (SOC.shape[0], np.max(SOC), np.mean(SOC), np.min(SOC))
print "Sand: total = %d, max = %0.2f, mean = %0.2f, min = %0.2f" % (Sand.shape[0], np.max(Sand),
np.mean(Sand), np.min(Sand))
def plot_hist(ind, data, max_y, title, color):
    counts, bins, patches = ax[ind].hist(data, facecolor=color, edgecolor='gray')
    # set the ticks to be at the edges of the bins.
    ax[ind].set_xticks(bins)
    # set the limits for x and y
    ax[ind].set_xlim([np.min(data), np.max(data)])
    ax[ind].set_ylim([0, max_y])
    # set the x-axis tick labels to be formatted with 1 decimal place
    ax[ind].xaxis.set_major_formatter(FormatStrFormatter('%0.1f'))
    ax[ind].set_title(title, fontsize=18)
    # Label the raw counts and the percentages below the x-axis
    bin_centers = 0.5 * np.diff(bins) + bins[:-1]
    for count, x in zip(counts, bin_centers):
        # Label the raw counts
        ax[ind].annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                         xytext=(0, -18), textcoords='offset points', va='top', ha='center')
        # Label the percentages
        percent = '%0.1f%%' % (100 * float(count) / counts.sum())
        ax[ind].annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                         xytext=(0, -32), textcoords='offset points', va='top', ha='center')
fig, ax = plt.subplots(3, 2, figsize=(15, 20))
fig.subplots_adjust(hspace = 0.5, wspace=.2)
ax = ax.ravel()
# Ca
plot_hist(0, Ca, Ca.shape[0], 'Ca Value Histogram', colors[0])
# P
plot_hist(1, P, P.shape[0], 'P Value Histogram', colors[1])
#pH
plot_hist(2, pH, pH.shape[0], 'pH Value Histogram', colors[2])
#SOC
plot_hist(3, SOC, SOC.shape[0], 'SOC Value Histogram', colors[3])
#Sand
plot_hist(4, Sand, Sand.shape[0], 'Sand Value Histogram', colors[4])
# delete the last subplot
fig.delaxes(ax[5])
In [7]:
# Inspect the data in the predictor variables
def plot_data(ind, data_x, data_y, aspect, title, color):
    ax[ind].set_title(title, fontsize=18)
    ax[ind].set_xlabel('Predictor Values', fontsize=14)
    ax[ind].set_ylabel('Response Values', fontsize=12)  # used for all five response variables, not just Ca
    ax[ind].set_aspect(aspect=aspect, adjustable='box')
    ax[ind].grid(True)
    ax[ind].scatter(data_x, data_y, color=color, alpha=0.2, marker='o', edgecolors='black')
# set up the grid plot
fig, ax = plt.subplots(2, 3, figsize=(15, 20))
#fig.subplots_adjust(hspace = 0.5, wspace=.2)
ax = ax.ravel()
# select the predictor range (note this is influenced by the PCA below)
my_col = 20
X_sub = np.ravel(X[:,:my_col].reshape(-1,1))
# Ca
plot_data(0, X_sub, np.repeat(Ca, my_col), 0.1, 'Ca vs. %d Predictors' % my_col, colors[0])
# P
plot_data(1, X_sub, np.repeat(P, my_col), 0.1, 'P vs. %d Predictors' % my_col, colors[1])
#pH
plot_data(2, X_sub, np.repeat(pH, my_col), 0.2, 'pH vs. %d Predictors' % my_col, colors[2])
#SOC
plot_data(3, X_sub, np.repeat(SOC, my_col), 0.1, 'SOC vs. %d Predictors' % my_col, colors[3])
#Sand
plot_data(4, X_sub, np.repeat(Sand, my_col), 0.35, 'Sand vs. %d Predictors' % my_col, colors[4])
# delete the last subplot
fig.delaxes(ax[5])
Which features have more impact?
There are over three thousand features in this data set but comparatively few rows, so we are working with a large-k, small-n problem. Perhaps there is a subset of features to focus on.
Below, we investigate two variations of PCA to explain the variance across the features. We observe that the first 20 components each explain a meaningful portion of the variance, while the components beyond 20 add little. The first 70-80 components explain ~100% of the variance.
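As a quick cross-check of those counts, the number of components needed for a chosen variance threshold can be read off the cumulative explained-variance ratio. A minimal sketch, reusing the PCA and numpy imports above (the 0.99 threshold is an arbitrary choice, not part of the original analysis):
pca_check = PCA().fit(X)
cum_ratio = np.cumsum(pca_check.explained_variance_ratio_)
# first index where the cumulative ratio reaches 0.99, converted to a component count
n_keep = int(np.searchsorted(cum_ratio, 0.99)) + 1
print('%d components explain 99%% of the training variance' % n_keep)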
In [8]:
# Linear PCA using all of the features
n_comp = feature_names.shape[0]
pca_lin = PCA(n_components = n_comp)
pca_lin.fit(X)
pca_lin_cumsum = np.cumsum(pca_lin.explained_variance_ratio_)
# Non-linear kernel RBF PCA using all of the features
pca_kern = KernelPCA(n_components = n_comp, kernel = 'rbf')
pca_kern.fit(X)
# build the explained variance ratio list for pca_kern
explained_var_ratio_kern = []
for i in range(0, pca_kern.lambdas_.shape[0]):
    explained_var_ratio_kern.append(pca_kern.lambdas_[i] / sum(pca_kern.lambdas_))
pca_kern_cumsum = np.cumsum(np.asarray(explained_var_ratio_kern))
# Plot the Information Gain graph
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(pca_lin_cumsum, color = 'purple', marker = 'o', ms = 5, mfc = 'red', label = 'pca_lin')
ax.plot(pca_kern_cumsum, color = 'purple', marker = 'o', ms = 5, mfc = 'yellow', label = 'pca_kern')
plt.legend(loc='center left', bbox_to_anchor=(1, 0.91), shadow=False, scatterpoints=1)
fig.suptitle('Cumulative Information Gain', fontsize=18)
plt.xlabel('Number of Components', fontsize=14)
plt.ylabel('Cumulative Variance Ratio', fontsize=12)
plt.grid(True)
ax.set_xlim([0,30])
ax.set_ylim([0.5,1.0])
# Output variance fractions
print '\n-------------------------------------------'
print 'Fraction of the total variance in the training data explained by the first k components (linear PCA vs. kernel PCA): \n'
for k in range(1, 76):
    print("%d \t %s \t %s" % (k, '{0:.2f}%'.format(pca_lin_cumsum[k-1] * 100),
                              '{0:.2f}%'.format(pca_kern_cumsum[k-1] * 100)))
In [9]:
# Linear Regression with PCA combinations
y_pipelines_lin = []
y_scores_lin = []
start = time.time()
for ind, y in enumerate(y_vars):
    # set up the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print '\n----------', y_var_labels[ind]
    pca = PCA()
    linear = LinearRegression()
    steps = [('pca', pca), ('linear', linear)]
    pipeline = Pipeline(steps)
    parameters = dict(pca__n_components=list(range(20, 90, 10)))
    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=0)
    cv.fit(X_train, y_train)
    print 'Cross_val_score: ', cross_val_score(cv, X_test, y_test)
    y_predictions = cv.predict(X_test)
    mse = mean_squared_error(y_test, y_predictions)
    print 'Explained variance score: ', explained_variance_score(y_test, y_predictions)
    print 'Mean absolute error: ', mean_absolute_error(y_test, y_predictions)
    print 'Mean squared error: ', mse
    print 'R2 score: ', r2_score(y_test, y_predictions)
    display(pd.DataFrame.from_dict(cv.cv_results_))
    # capture the best pipeline estimator and mse value
    y_pipelines_lin.append(cv.best_estimator_)
    y_scores_lin.append(mse)
print '\nCompleted in %0.2f sec.' % (time.time()-start)
In [10]:
print len(y_pipelines_lin)
print y_scores_lin
In [11]:
# Linear Regression with PCA and SelectKBest
y_pipelines_linsel = []
y_scores_linsel = []
start = time.time()
for ind, y in enumerate(y_vars):
    # set up the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print '\n----------', y_var_labels[ind]
    pca = PCA(n_components=2)
    selection = SelectKBest(k=1)
    combined_features = FeatureUnion([('pca', pca), ('univ_select', selection)])
    linear = LinearRegression()
    steps = [('features', combined_features), ('linear', linear)]
    pipeline = Pipeline(steps)
    parameters = dict(features__pca__n_components=list(range(20, 90, 10)),
                      features__univ_select__k=[1, 2, 3])
    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=0)
    cv.fit(X_train, y_train)
    print 'Cross_val_score: ', cross_val_score(cv, X_test, y_test)
    y_predictions = cv.predict(X_test)
    mse = mean_squared_error(y_test, y_predictions)
    print 'Explained variance score: ', explained_variance_score(y_test, y_predictions)
    print 'Mean absolute error: ', mean_absolute_error(y_test, y_predictions)
    print 'Mean squared error: ', mse
    print 'R2 score: ', r2_score(y_test, y_predictions)
    display(pd.DataFrame.from_dict(cv.cv_results_))
    # capture the best pipeline estimator and mse value
    y_pipelines_linsel.append(cv.best_estimator_)
    y_scores_linsel.append(mse)
print '\nCompleted in %0.2f sec.' % (time.time()-start)
In [12]:
print len(y_pipelines_linsel)
print y_scores_linsel
In [13]:
# Ridge Regression with PCA combinations
y_pipelines_ridge = []
y_scores_ridge = []
start = time.time()
for ind, y in enumerate(y_vars):
    # set up the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print '\n----------', y_var_labels[ind]
    pca = PCA()
    ridge = Ridge()
    steps = [('pca', pca), ('ridge', ridge)]
    pipeline = Pipeline(steps)
    parameters = dict(pca__n_components=list(range(20, 90, 10)),
                      ridge__alpha=np.linspace(0.0, 0.5, 5))
    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=0)
    cv.fit(X_train, y_train)
    print 'Cross_val_score: ', cross_val_score(cv, X_test, y_test)
    y_predictions = cv.predict(X_test)
    mse = mean_squared_error(y_test, y_predictions)
    print 'Explained variance score: ', explained_variance_score(y_test, y_predictions)
    print 'Mean absolute error: ', mean_absolute_error(y_test, y_predictions)
    print 'Mean squared error: ', mse
    print 'R2 score: ', r2_score(y_test, y_predictions)
    display(pd.DataFrame.from_dict(cv.cv_results_))
    # capture the best pipeline estimator and mse value
    y_pipelines_ridge.append(cv.best_estimator_)
    y_scores_ridge.append(mse)
print '\nCompleted in %0.2f sec.' % (time.time()-start)
In [14]:
print len(y_pipelines_ridge)
print y_scores_ridge
In [15]:
# SVR with PCA combinations
y_pipelines_svr = []
y_scores_svr = []
start = time.time()
for ind, y in enumerate(y_vars):
    # set up the train and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print '\n----------', y_var_labels[ind]
    pca = PCA()
    svr = SVR()
    steps = [('pca', pca), ('svr', svr)]
    pipeline = Pipeline(steps)
    parameters = dict(pca__n_components=list(range(20, 90, 10)),
                      svr__kernel=['rbf'],
                      svr__C=[1e3])
    cv = GridSearchCV(pipeline, param_grid=parameters, verbose=0)
    cv.fit(X_train, y_train)
    print 'Cross_val_score: ', cross_val_score(cv, X_test, y_test)
    y_predictions = cv.predict(X_test)
    mse = mean_squared_error(y_test, y_predictions)
    print 'Explained variance score: ', explained_variance_score(y_test, y_predictions)
    print 'Mean absolute error: ', mean_absolute_error(y_test, y_predictions)
    print 'Mean squared error: ', mse
    print 'R2 score: ', r2_score(y_test, y_predictions)
    display(pd.DataFrame.from_dict(cv.cv_results_))
    # capture the best pipeline estimator and mse value
    y_pipelines_svr.append(cv.best_estimator_)
    y_scores_svr.append(mse)
print '\nCompleted in %0.2f sec.' % (time.time()-start)
In [16]:
print len(y_pipelines_svr)
print y_scores_svr
In [17]:
# Pick out the best performing models/pipelines based on mse for each predictor
# combine results lists from modeling cells
y_vars_pipelines = [y_pipelines_lin, y_pipelines_linsel, y_pipelines_ridge, y_pipelines_svr]
y_vars_scores = [y_scores_lin, y_scores_linsel, y_scores_ridge, y_scores_svr]
pipelines = np.array(y_vars_pipelines)
scores = np.array(y_vars_scores)
pipeline_winners = []
print pipelines.shape
# Note: P scores poorly across all models
print scores
for ind, y in enumerate(y_vars):
    # get index of best score
    best_ind = np.argmin(scores[:, ind])
    print(best_ind, ind)
    # capture the pipeline
    pipeline_winners.append(pipelines[best_ind, ind])
print len(pipeline_winners)
In [18]:
# Iterate through test samples
allPredictions = []
pipeline_winners = y_pipelines_lin  # NOTE: overrides the per-target winners selected above; the linear regression pipelines are used for every target
for s_ind in range(len(test_x)):
    sampleId = test_ids[s_ind]
    sample = test_x[s_ind]
    currentSamplePredictions = []
    # Use the selected model to estimate each outcome variable
    for ind in range(0, 5):
        pred = pipeline_winners[ind].predict(sample.reshape(1, -1))[0]
        currentSamplePredictions.append(pred)
    allPredictions.append(currentSamplePredictions)
#print len(allPredictions)
#print allPredictions
print 'Predictions calculated.'
In [21]:
# Generate csv for AfricaSoil Kaggle
filename = 'jsccjc_20170423_2.csv'
# Opening in 'w' mode truncates any existing file with this name
with open(filename, 'w') as f:
    f.write('PIDN,Ca,P,pH,SOC,Sand\n')
    # Iterate through test samples
    for i in range(len(allPredictions)):
        pred = allPredictions[i]
        testId = test_ids[i]
        text = testId + ',' + ','.join(str(p) for p in pred) + '\n'
        f.write(text)
In [20]:
# Check where jupyter may drop the csv if it can't be found where expected
import os
fileDir = os.path.dirname(os.path.realpath('__file__'))
print fileDir
NOTES
Below is a series of notes collected over the course of this final project. They are captured for reference.
Options for high-dimensional data with a large number of features and relatively few observations: choose random subsets of variables and assess their importance using cross-validation; use ridge regression, the lasso, or the elastic net for regularization (the process of introducing additional information in order to solve an ill-posed problem or to prevent overfitting); or choose a technique, such as a support vector machine or random forest, that deals well with a large number of predictors. Refer to the reference list for the original source.
LASSO (least absolute shrinkage and selection operator) is a regression analysis method that performs both variable selection and regularization in order to enhance the prediction accuracy and interpretability of the statistical model it produces.
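As a hedged illustration of the lasso idea on this data set (not one of the pipelines above; LassoCV, the 5-fold CV, and fitting the raw spectra against a single target are assumptions made for the sketch):
from sklearn.linear_model import LassoCV

# Minimal sketch: lasso with built-in cross-validation on a single target (Ca).
lasso = LassoCV(cv=5)                  # selects the regularization strength alpha by CV
lasso.fit(X, Ca)
n_selected = (lasso.coef_ != 0).sum()
print('alpha = %.4f, non-zero coefficients: %d of %d' % (lasso.alpha_, n_selected, X.shape[1]))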
When considering ML methods:
Always start simple: the first algorithms to try are naive Bayes, logistic regression, k-nearest neighbours (starting with one neighbour), and Fisher's linear discriminant, before anything else. For more advanced machine learning, ensemble methods tend to produce the best results, as shown by Kaggle competition winners, and XGBoost has been especially popular among them. Neural networks may be useful for predicting values, but the number of observations here is low. Refer to the reference list for the original source.
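As a hedged ensemble baseline for this regression problem (XGBoost is not imported in this notebook, so scikit-learn's random forest is used as a stand-in; the hyperparameters and 5-fold CV are illustrative assumptions):
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Minimal sketch: cross-validate a random forest ensemble on one target (SOC).
rf = RandomForestRegressor(n_estimators=100, random_state=0)
scores = cross_val_score(rf, X, SOC, cv=5, scoring='neg_mean_squared_error')
print('Random forest CV MSE for SOC: %.3f' % (-scores.mean()))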
Subject: soil quality for agriculture. Predictor variables: 3593 features (see feature_names). Response variables: 'Ca', 'P', 'pH', 'SOC', 'Sand'.
A continuous predictor variable is sometimes called a covariate and a categorical predictor variable is sometimes called a factor. For example, in a cake experiment a covariate could be various oven temperatures and a factor could be different ovens. Usually, you create a plot with the predictor variables on the x-axis and the response variables on the y-axis. Refer to the reference list for the original source.
For continuous variables such as income, it is customary to do a log transformation to get it as close to a normal distribution as possible. You can then employ OLS and run some diagnostics to check your model fit. For other types of continuous variables, get a histogram and check the distribution. If it is somewhat normal, you can run an OLS and check the diagnostics and model fit. Refer to reference list for original source.
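A minimal sketch of the log-transform-then-OLS idea, reusing the OLS class already imported above. The transformed targets in this data set include negative values, so the shift to a strictly positive response below is purely for illustration; a genuinely positive variable such as income would not need it:
# Illustration only: shift one response to strictly positive values, log it, and
# fit an OLS model on a handful of predictors, then inspect the diagnostics.
y_pos = pH - pH.min() + 1.0                     # hypothetical strictly positive response
y_log = np.log(y_pos)                           # bring the response closer to normal
X_design = np.column_stack((np.ones(len(y_log)), X[:, :5]))  # intercept + first 5 predictors
results = OLS(y_log, X_design).fit()
print(results.summary())                        # diagnostics: R^2, coefficient t-stats, etc.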
General References:
SKLearn References: